MacAddict 108

home *** CD-ROM | disk | FTP | other *** search

/ MacAddict 108 / MacAddict108.iso / Software / Internet & Communication / JunkMatcher 1.5.5.dmg / JunkMatcher.app / Contents / Resources / Engine / HTMLBody.py < prev next >

Wrap

Python Source | 2005-06-01 | 10.8 KB | 287 lines

# # HTMLBody.py # JunkMatcher # # Created by Benjamin Han on 2/1/05. # Copyright (c) 2005 Benjamin Han. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #!/usr/bin/env python import htmlentitydefs, urllib from consts import * from utilities import * from GlobalObjects import * from HTMLEncoding import * _TAG_FOR_HIDDEN_URL = u'' _entityPat = re.compile(r'&#?\w+;') _tagPat = re.compile(r'<\s*([^>\s]+)([^>]*)>') _vacuousTagPat = re.compile(r'(?s)(\S)<([^>/\s]+)([^>]*)>\s*</([^>/\s]+)[^>]*>(\S)') _attrPat = re.compile(r'(?i)(?:([^"\'<>=\s]+)\s*=\s*["\']?\s*)?([^"\'<>\s]+)["\']?') _ELINKS_ARGS='-force-html -auto-submit 0 -dump 1 -no-home 1 -stdin 1 -dump-charset utf8' _htmlEncodingExtractor = HTMLEncodingExtractor() _htmlFormatter = HTMLFormatter() name2codepoint = htmlentitydefs.name2codepoint def _translateEntities (mo): en = mo.group(0) if en.startswith('&#'): if en[2] == 'x': try: return unichr(int(en[3:-1], 16)) except: return en else: try: return unichr(int(en[2:-1])) except: return en else: c = name2codepoint.get(en[1:-1]) if c is None: return en else: return unichr(c) class _ReplaceHiddenURL (object): # improving performance by not having __dict__ __slots__ = ('urlDict', 'spanList', 'base', 'attrs') def __init__ (self): self.urlDict = {} # a dict of valid (non-hidden) URLs self.spanList = [] # list of tuples (start,end) in the original HTML def __call__ (self, mo): """Must be called after self.base and self.attrs (a sets.Set) is set!""" attr = mo.group(1) value = mo.group(2).strip() httpMO = httpPat.search(value) if httpMO: # it's an URL if self.attrs is not None and attr is not None and attr.lower() in self.attrs: url = urllib.unquote(httpMO.group(0)).lower() # unquote the URL self.urlDict[url] = self.urlDict.get(url, 0) + 1 return '%s="%s"' % (attr, url) else: self.spanList.append((mo.start(0) + self.base, mo.end(0) + self.base)) return _TAG_FOR_HIDDEN_URL else: # it's not an URL if attr: return '%s="%s"' % (attr, value) else: return value class _ReplaceBadTag (object): # improving performance by not having __dict__ __slots__ = ('replaceHiddenURL', 'spanList') def __init__ (self): self.replaceHiddenURL = _ReplaceHiddenURL() self.spanList = [] def __call__ (self, mo): tag = mo.group(1).lower() base = mo.start(2) # for replaceHiddenURL.spanList self.replaceHiddenURL.base = base attributes = mo.group(2) if attributes is not None: attributes = attributes.lower() if len(attributes.strip()) == 0: attributes = None if tag[0] == '/': # it's an end tag if tag[1:] in globalObjects.htmlTags: if attributes is not None: # we don't need to unquote the http strings cuz we just want to identify # it's an URL for moIter in filter(lambda mo: httpPat.match(mo.group(2).strip()), _attrPat.finditer(attributes)): # yes the value is an URL self.replaceHiddenURL.spanList.append((moIter.start(2) + base, moIter.end(2) + base)) return '<%s>' % tag else: # a bad tag self.spanList.append((mo.start(0), mo.end(0))) return '' elif tag.startswith('!--'): # don't count comments as bad tags return '' # it's a starting tag attrs = globalObjects.htmlTags.get(tag, False) if attrs is not False: if attrs is None: # no URL is allowed in attributes if attributes is None: return '<%s>' % tag else: self.replaceHiddenURL.attrs = None attributes = _attrPat.sub(self.replaceHiddenURL, attributes) return '<%s %s>' % (tag, attributes) else: # URL is allowed only after a certain attribute name if attributes is None: return '<%s>' % tag else: # if attrs is '*' we don't check for hidden URLs if isinstance(attrs, sets.Set): self.replaceHiddenURL.attrs = attrs attributes = _attrPat.sub(self.replaceHiddenURL, attributes) return '<%s %s>' % (tag, attributes) else: # a bad tag self.spanList.append((mo.start(0), mo.end(0))) return '' class _ReplaceVacuousTag (object): # improving performance by not having __dict__ __slots__ = ('urlDict', 'spanList') def __init__ (self, urlDict): self.urlDict = urlDict self.spanList = [] def __call__ (self, mo): if mo.group(2).lower() == mo.group(4).lower(): # remove URL, if any, from urlDict httpMO = httpPat.search(mo.group(3)) if httpMO: url = httpMO.group(0).lower() if self.urlDict.has_key(url): self.urlDict[url] -= 1 self.spanList.append((mo.end(1), mo.start(5))) return ''.join((mo.group(1), mo.group(5))) else: return mo.group(0) class HTMLBody (object): """ An HTML message body -------------------- I. the following are set by __init__() htmlSrc: the raw source, in Unicode encoding: the encoding of the HTML message (could be None) contentWithoutEntities: the content without entities, in Unicode contentWithoutBadTags: the content without bad tags/hidden URLs, in Unicode content: final content without vacuous tags, in Unicode urlDict: a dict of (URL, count) tuples; URLs are unquoted and count could be 0 hiddenURLList: a list of (start, end) index tuples for identified hidden URLs (w.r.t. contentWithoutEntities) badTagList: a list of (start, end) index tuples for identified bad tags (w.r.t. contentWithoutEntities) vacuousTagList: a list of (start, end) index tuples for identified vacuous tags (w.r.t. contentWithoutBadTags) II. others: rendering: set by setRendering() (via elinks). """ # improving performance by not having __dict__ __slots__ = ('htmlSrc', 'encoding', 'contentWithoutEntities', 'contentWithoutBadTags', 'content', 'urlDict', 'hiddenURLList', 'badTagList', 'vacuousTagList', 'rendering') def __init__ (self, htmlSrc, defaultEncoding = None): """htmlSrc must be raw data (no encoding).""" # get the encoding (charset in the meta tag) encoding = _htmlEncodingExtractor.extract(htmlSrc) if encoding: # correct possible mispellings mispelling = charsetMispellings.get(encoding) if mispelling: encoding = mispelling self.encoding = encoding else: self.encoding = defaultEncoding # self.encoding can fall back to the defaultEncoding, # but encoding is *extracted* from htmlSrc, which can be None htmlSrc = decodeText(htmlSrc, self.encoding) # decode into Unicode self.htmlSrc = _htmlFormatter.format(htmlSrc, # rewrite/insert charset encoding is None, _htmlEncodingExtractor.hasTagHTML, _htmlEncodingExtractor.hasTagHead) htmlSrc = _entityPat.sub(_translateEntities, htmlSrc).strip() # rid of entities # At this point: both htmlSrc and self.htmlSrc are in Unicode; their differences: # 1. self.htmlSrc keeps all entities, but htmlSrc doesn't # 2. in the meta tag, self.htmlSrc always has charset=utf8, but htmlSrc keeps the # original. # 3. self.htmlSrc will be fed to elinks (which can't deal with lots of charsets), # and htmlSrc will be used to produce other content* attributes self.contentWithoutEntities = htmlSrc # cleaning bad tags and hidden URLs replaceBadTag = _ReplaceBadTag() self.contentWithoutBadTags = _tagPat.sub(replaceBadTag, htmlSrc).strip() self.hiddenURLList = replaceBadTag.replaceHiddenURL.spanList self.urlDict = replaceBadTag.replaceHiddenURL.urlDict self.badTagList = replaceBadTag.spanList # removing vacuous tags replaceVacuousTag = _ReplaceVacuousTag(self.urlDict) self.content = _vacuousTagPat.sub(replaceVacuousTag, self.contentWithoutBadTags).strip() self.vacuousTagList = replaceVacuousTag.spanList def setRendering (self): """Returns HTML rendering using elinks.""" if hasattr(self, 'rendering'): return self.rendering # invoke elinks try: #print encodeText(self.htmlSrc) elinksIn, elinksOut = os.popen2('"%s"elinks %s'%(BIN_PATH, _ELINKS_ARGS)) elinksIn.write(encodeText(self.htmlSrc)) elinksIn.close() self.rendering = decodeText(elinksOut.read(), 'utf8') # do NOT strip()! elinksOut.close() except: self.rendering = u'[executing elinks caused unknown errors]' if __name__ == '__main__': import sys if len(sys.argv) == 1: print 'Usage: ./HTMLBody.py <filename>' print ' * filename is the name of the file containing HTML raw source.' sys.exit(1) htmlBody = HTMLBody(open(sys.argv[1]).read()) print encodeText(htmlBody.content) htmlBody.setRendering() print encodeText(htmlBody.rendering)